Importing Dataset & Libraries

library(cluster)
library(tidyverse) 
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.3
library(factoextra) 
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.3.3
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.3
library(ggplot2)
library(scales)
## Warning: package 'scales' was built under R version 4.3.3
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.3
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(waffle)
## Warning: package 'waffle' was built under R version 4.3.3
library(dplyr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
custom_red <- "#f8766d"
custom_blue <- "#00bfc4"
hotel_data <- read.csv("C:/Users/mamid/Downloads/Hotel Reservations.csv")
head(hotel_data)
dim(hotel_data)
## [1] 36275    19

Data Cleaning

str(hotel_data)
## 'data.frame':    36275 obs. of  19 variables:
##  $ Booking_ID                          : chr  "INN00001" "INN00002" "INN00003" "INN00004" ...
##  $ no_of_adults                        : int  2 2 1 2 2 2 2 2 3 2 ...
##  $ no_of_children                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_of_weekend_nights                : int  1 2 2 0 1 0 1 1 0 0 ...
##  $ no_of_week_nights                   : int  2 3 1 2 1 2 3 3 4 5 ...
##  $ required_car_parking_space          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lead_time                           : int  224 5 1 211 48 346 34 83 121 44 ...
##  $ arrival_year                        : int  2017 2018 2018 2018 2018 2018 2017 2018 2018 2018 ...
##  $ arrival_month                       : int  10 11 2 5 4 9 10 12 7 10 ...
##  $ arrival_date                        : int  2 6 28 20 11 13 15 26 6 18 ...
##  $ repeated_guest                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_of_previous_cancellations        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_of_previous_bookings_not_canceled: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ avg_price_per_room                  : num  65 106.7 60 100 94.5 ...
##  $ no_of_special_requests              : int  0 1 0 0 0 1 1 1 1 3 ...
##  $ room_type_reserved                  : chr  "Room_Type 1" "Room_Type 1" "Room_Type 1" "Room_Type 1" ...
##  $ type_of_meal_plan                   : chr  "Meal Plan 1" "Not Selected" "Meal Plan 1" "Meal Plan 1" ...
##  $ market_segment_type                 : chr  "Offline" "Online" "Online" "Online" ...
##  $ booking_status                      : chr  "Not_Canceled" "Not_Canceled" "Canceled" "Canceled" ...
summary(hotel_data)
##   Booking_ID         no_of_adults   no_of_children    no_of_weekend_nights
##  Length:36275       Min.   :0.000   Min.   : 0.0000   Min.   :0.0000      
##  Class :character   1st Qu.:2.000   1st Qu.: 0.0000   1st Qu.:0.0000      
##  Mode  :character   Median :2.000   Median : 0.0000   Median :1.0000      
##                     Mean   :1.845   Mean   : 0.1053   Mean   :0.8107      
##                     3rd Qu.:2.000   3rd Qu.: 0.0000   3rd Qu.:2.0000      
##                     Max.   :4.000   Max.   :10.0000   Max.   :7.0000      
##  no_of_week_nights required_car_parking_space   lead_time       arrival_year 
##  Min.   : 0.000    Min.   :0.00000            Min.   :  0.00   Min.   :2017  
##  1st Qu.: 1.000    1st Qu.:0.00000            1st Qu.: 17.00   1st Qu.:2018  
##  Median : 2.000    Median :0.00000            Median : 57.00   Median :2018  
##  Mean   : 2.204    Mean   :0.03099            Mean   : 85.23   Mean   :2018  
##  3rd Qu.: 3.000    3rd Qu.:0.00000            3rd Qu.:126.00   3rd Qu.:2018  
##  Max.   :17.000    Max.   :1.00000            Max.   :443.00   Max.   :2018  
##  arrival_month     arrival_date  repeated_guest    no_of_previous_cancellations
##  Min.   : 1.000   Min.   : 1.0   Min.   :0.00000   Min.   : 0.00000            
##  1st Qu.: 5.000   1st Qu.: 8.0   1st Qu.:0.00000   1st Qu.: 0.00000            
##  Median : 8.000   Median :16.0   Median :0.00000   Median : 0.00000            
##  Mean   : 7.424   Mean   :15.6   Mean   :0.02564   Mean   : 0.02335            
##  3rd Qu.:10.000   3rd Qu.:23.0   3rd Qu.:0.00000   3rd Qu.: 0.00000            
##  Max.   :12.000   Max.   :31.0   Max.   :1.00000   Max.   :13.00000            
##  no_of_previous_bookings_not_canceled avg_price_per_room no_of_special_requests
##  Min.   : 0.0000                      Min.   :  0.00     Min.   :0.0000        
##  1st Qu.: 0.0000                      1st Qu.: 80.30     1st Qu.:0.0000        
##  Median : 0.0000                      Median : 99.45     Median :0.0000        
##  Mean   : 0.1534                      Mean   :103.42     Mean   :0.6197        
##  3rd Qu.: 0.0000                      3rd Qu.:120.00     3rd Qu.:1.0000        
##  Max.   :58.0000                      Max.   :540.00     Max.   :5.0000        
##  room_type_reserved type_of_meal_plan  market_segment_type booking_status    
##  Length:36275       Length:36275       Length:36275        Length:36275      
##  Class :character   Class :character   Class :character    Class :character  
##  Mode  :character   Mode  :character   Mode  :character    Mode  :character  
##                                                                              
##                                                                              
## 
skim(hotel_data)
Data summary
Name hotel_data
Number of rows 36275
Number of columns 19
_______________________
Column type frequency:
character 5
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Booking_ID 0 1 8 8 0 36275 0
room_type_reserved 0 1 11 11 0 7 0
type_of_meal_plan 0 1 11 12 0 4 0
market_segment_type 0 1 6 13 0 5 0
booking_status 0 1 8 12 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
no_of_adults 0 1 1.84 0.52 0 2.0 2.00 2 4 ▁▂▇▁▁
no_of_children 0 1 0.11 0.40 0 0.0 0.00 0 10 ▇▁▁▁▁
no_of_weekend_nights 0 1 0.81 0.87 0 0.0 1.00 2 7 ▇▃▁▁▁
no_of_week_nights 0 1 2.20 1.41 0 1.0 2.00 3 17 ▇▁▁▁▁
required_car_parking_space 0 1 0.03 0.17 0 0.0 0.00 0 1 ▇▁▁▁▁
lead_time 0 1 85.23 85.93 0 17.0 57.00 126 443 ▇▃▁▁▁
arrival_year 0 1 2017.82 0.38 2017 2018.0 2018.00 2018 2018 ▂▁▁▁▇
arrival_month 0 1 7.42 3.07 1 5.0 8.00 10 12 ▃▃▅▆▇
arrival_date 0 1 15.60 8.74 1 8.0 16.00 23 31 ▇▇▇▆▆
repeated_guest 0 1 0.03 0.16 0 0.0 0.00 0 1 ▇▁▁▁▁
no_of_previous_cancellations 0 1 0.02 0.37 0 0.0 0.00 0 13 ▇▁▁▁▁
no_of_previous_bookings_not_canceled 0 1 0.15 1.75 0 0.0 0.00 0 58 ▇▁▁▁▁
avg_price_per_room 0 1 103.42 35.09 0 80.3 99.45 120 540 ▇▅▁▁▁
no_of_special_requests 0 1 0.62 0.79 0 0.0 0.00 1 5 ▇▁▁▁▁

##Checking Null Values & Duplicated Rows

colSums(is.na(hotel_data))
##                           Booking_ID                         no_of_adults 
##                                    0                                    0 
##                       no_of_children                 no_of_weekend_nights 
##                                    0                                    0 
##                    no_of_week_nights           required_car_parking_space 
##                                    0                                    0 
##                            lead_time                         arrival_year 
##                                    0                                    0 
##                        arrival_month                         arrival_date 
##                                    0                                    0 
##                       repeated_guest         no_of_previous_cancellations 
##                                    0                                    0 
## no_of_previous_bookings_not_canceled                   avg_price_per_room 
##                                    0                                    0 
##               no_of_special_requests                   room_type_reserved 
##                                    0                                    0 
##                    type_of_meal_plan                  market_segment_type 
##                                    0                                    0 
##                       booking_status 
##                                    0
duplicate_rows <- duplicated(hotel_data)
hotel_data[duplicate_rows, ]

No duplicate entries found.

Number of unique values per column (including categorical variables)

sapply(hotel_data, function(x) length(unique(x)))
##                           Booking_ID                         no_of_adults 
##                                36275                                    5 
##                       no_of_children                 no_of_weekend_nights 
##                                    6                                    8 
##                    no_of_week_nights           required_car_parking_space 
##                                   18                                    2 
##                            lead_time                         arrival_year 
##                                  352                                    2 
##                        arrival_month                         arrival_date 
##                                   12                                   31 
##                       repeated_guest         no_of_previous_cancellations 
##                                    2                                    9 
## no_of_previous_bookings_not_canceled                   avg_price_per_room 
##                                   59                                 3930 
##               no_of_special_requests                   room_type_reserved 
##                                    6                                    7 
##                    type_of_meal_plan                  market_segment_type 
##                                    4                                    5 
##                       booking_status 
##                                    2

Unique values for non-numerical columns:

table(hotel_data$type_of_meal_plan)
## 
##  Meal Plan 1  Meal Plan 2  Meal Plan 3 Not Selected 
##        27835         3305            5         5130
table(hotel_data$room_type_reserved)
## 
## Room_Type 1 Room_Type 2 Room_Type 3 Room_Type 4 Room_Type 5 Room_Type 6 
##       28130         692           7        6057         265         966 
## Room_Type 7 
##         158
table(hotel_data$market_segment_type)
## 
##      Aviation Complementary     Corporate       Offline        Online 
##           125           391          2017         10528         23214
table(hotel_data$booking_status)
## 
##     Canceled Not_Canceled 
##        11885        24390

Data Transformation

Removing ID Column

hotel_data <- subset(hotel_data, select = -Booking_ID)

Transforming Target Column

Transforming the “booking_status” column to “canceled” and using Boolean variable types.

names(hotel_data)[names(hotel_data) == "booking_status"] <- "canceled"
hotel_data$canceled <- ifelse(hotel_data$canceled == "Canceled", TRUE, FALSE)
head(hotel_data["canceled"])

Column type has been transformed to Logical, aka Boolean.

#Transforming Columns to Logical Type

Converting the “repeated_guest” and “required_car_parking_space” columns to Boolean variables.

hotel_data$repeated_guest <- ifelse(hotel_data$repeated_guest == 1, TRUE, FALSE)
hotel_data$required_car_parking_space <- ifelse(hotel_data$required_car_parking_space == 1, TRUE, FALSE)
head(select(hotel_data,repeated_guest,required_car_parking_space))

#Transforming Columns to Numerical Type

#Room Type Reserved Column

Transforming the “room_type_reserved” column into an integer representation of room types by replacing “Room_Type” with an empty character using the gsub() function.

hotel_data$room_type_reserved <- gsub("Room_Type ", "", hotel_data$room_type_reserved) # Replace "Room_Type " with empty char
head(hotel_data["room_type_reserved"])

The column is still Char, updating it to integer type

hotel_data$room_type_reserved <- as.integer(hotel_data$room_type_reserved)
print(typeof(hotel_data$room_type_reserved))
## [1] "integer"

#Type of Meal Plan Column

hotel_data$type_of_meal_plan <- gsub("Not Selected", 0, hotel_data$type_of_meal_plan) # Replace "Not Selected" with 0
hotel_data$type_of_meal_plan <- gsub("Meal Plan ", "", hotel_data$type_of_meal_plan)
hotel_data$type_of_meal_plan <- as.integer(hotel_data$type_of_meal_plan)
head(hotel_data["type_of_meal_plan"])

#Merging Date Columns in a Single One

Introducing a new column, ‘date’, formatted as a Date type. This column will prove valuable for future analytics purposes.

hotel_data <- cbind(hotel_data[, 1:11], date = as.Date(paste(hotel_data$arrival_date, hotel_data$arrival_month, hotel_data$arrival_year, sep="-"), format="%d-%m-%Y"), hotel_data[, 12:ncol(hotel_data)])
subset_data <- subset(hotel_data, is.na(date), c(arrival_year, arrival_month, arrival_date, date))

An issue was discovered in the dataset: February 29th is invalid in 2018 as it was not a leap year. To address this, all rows corresponding to this non-existent date will be removed from the original dataset.

hotel_data <- hotel_data[complete.cases(hotel_data$date), ]

The changes have been implemented successfully, resulting in a dataset containing 25,965 rows.

Exploratory Data Analysis

#Distribution of Canceled Bookings

hotel_data_plot <- ggplot(hotel_data, aes(x = canceled, fill = canceled)) + 
             geom_bar() + 
             geom_text(stat='count', aes(label=after_stat(count)), vjust=-0.64) +
             theme_void() +
             guides(fill = "none")

hotel_pie_chart <- ggplot(hotel_data, aes(x = "", fill = canceled)) +
            geom_bar(width = 1) +
            coord_polar(theta = "y") +
            guides(fill = guide_legend(title = "Canceled", ncol = 1)) +
            geom_text(aes(label = paste0(round((after_stat(count))/sum(after_stat(count)) * 100, 2), "%")),
            stat = "count", 
            position = position_stack(vjust = 0.5)) +
            theme_void() +
            theme(legend.position = "bottom")

grid.arrange(hotel_data_plot, 
             hotel_pie_chart, 
             ncol = 2, widths = c(4, 3.5), top = "Distribution of Canceled Bookings")

Out of the total number of bookings (25,965), only 7,435 (28.63%) were canceled, while 18,530 (71.37%) reservations were confirmed.

#Variation of the Average Price per Room

ggplot(hotel_data, aes(x = date, y = avg_price_per_room)) + 
  geom_smooth(method="auto") +
  geom_smooth(method="lm",color="red")+
  labs(x = "Month", y = "Average Price per Room") +
  ggtitle("Variation of Average Price per Room over Time (2017-2018)") +
  scale_x_date(date_breaks = "1 month", date_labels = "%m")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'

Over time, we see a steady increase in the average accommodation price, with two significant peaks around May/June 2018 and September 2018. Although it peaked much lower than in 2018, there was still a price increase in September 2017. The early months of the year, from January to mid-February, are usually when the prices are lowest. The link between supply and demand is clearly shown in this chart, where prices tend to grow in the summer and around September because of strong demand, but they stay relatively lower at the beginning of the year because of weaker demand.

Variation of Bookings Count

ggplot(hotel_data, aes(x = date)) + 
  geom_bar(aes(fill = canceled)) +
  geom_density(data = subset(hotel_data, canceled == TRUE), aes(y = after_stat(count)),linewidth=0.8)+
  labs(x = "Date", y = "Count", fill = "Canceled") +
  ggtitle("Variation of Reservations count over Time (2017-2018)") +
  theme(legend.position = "bottom")+
  scale_x_date(date_breaks = "1 month", date_labels = "%m")

The graph displays the evolution of reservations over time, encompassing both canceled and uncanceled bookings. It exhibits a pattern akin to that of the average room price variance, which can be attributed to variations in demand throughout the year.

Reservations tend to be accompanied by an increase in cancellations. We see an increase in cancellations beginning in February, which peaks modestly in mid-April, declines slightly in June and July, and peaks significantly in mid-August to mid-September. By year’s conclusion, cancellations begin to decline once more. Furthermore, there aren’t many cancellations between November and January, which suggests a reduced cancellation rate during that time.

Distribution of Meal Plan Types by Cancellation Status

hotel_data_plot <- ggplot(hotel_data, aes(x = type_of_meal_plan, fill = canceled)) +
  geom_bar(position="dodge") +
  labs(x = "", y = "", fill = "Canceled") +
  geom_text(stat='count', aes(label=after_stat(count)),position=position_dodge(width = 0.85), vjust=-0.2) +
  theme(legend.position = c(0.98, 0.98),
        legend.justification = c(1, 1))
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
hotel_waffle_chart <- waffle(prop.table(table(hotel_data$type_of_meal_plan)) * 100,rows=11,reverse = TRUE,size=1.5, legend_pos = "bottom") +
  theme(legend.direction = "vertical")+
  theme(legend.spacing.y = unit(-0.5,"cm"))

grid.arrange(hotel_data_plot, hotel_waffle_chart, ncol = 2, widths = c(2, 1),top="Distribution of Meal Plan Types by Cancellation Status")

The majority of bookings either opt for the first meal plan option or do not select any meal plan at all.

Distribution of Room Types by Cancellation Status

hotel_data_plot <- ggplot(hotel_data, aes(x = room_type_reserved, fill = canceled)) +
  geom_bar(position="dodge") +
  labs(x = "", y = "", fill = "Canceled") +
  geom_text(stat='count', aes(label=after_stat(count)),position=position_dodge(width = 0.9), vjust=-0.5,size =3.1) +
  theme(legend.position = c(0.98, 0.98),
        legend.justification = c(1, 1))+
scale_x_continuous(breaks = hotel_data$room_type_reserved)

hotel_waffle_chart <- waffle(prop.table(table(hotel_data$room_type_reserved)) * 100,rows=11,reverse = TRUE,size=1.5, legend_pos = "bottom") +
  theme(legend.direction = "vertical",
        legend.spacing.y = unit(-0.5,"cm"),
        legend.title = element_blank(),
        legend.text = element_text(size = 10)) +
  guides(fill = guide_legend(override.aes = list(size = 3)))

grid.arrange(hotel_data_plot, hotel_waffle_chart, ncol = 2, widths = c(2, 1),top="Distribution of Room Types Reserved by Cancellation Status")

The majority of clients prefer either the first type of rooms or the fourth type.

Lead Time by number of reservations

ggplot(hotel_data, aes(x = lead_time)) +
  geom_histogram(binwidth = 10,color = "white",fill=custom_blue) +
  labs(x = "Lead Time", y = "Count") +
  ggtitle("Variation of Lead Time")

There is an inversely proportional relationship between the lead time and the number of reservations. As the lead time increases, the number of reservations decreases.

#variation of Lead time by Booking status

ggplot(hotel_data, aes(x = lead_time, fill = canceled, group = canceled)) +
  geom_density(alpha = 0.8) +
  labs(x = "Lead Time", y = "Density", fill = "Canceled") +
  ggtitle("Variation of Lead Time by Booking Status")

A discernible pattern suggests that the probability of cancellations rises with increasing lead times. Conversely, shorter lead times typically result in a higher likelihood of confirmed reservations.

Number of Children & Adults

histogram_adults_data <- ggplot(hotel_data) +
  geom_histogram(aes(x = no_of_adults),binwidth = 1,color="white",fill=custom_red) +
  labs( y = "Count",x="") +
  ggtitle("Distribution of the Number of Adults") +
  theme(text=element_text(size=10))

histogram_children_data <- ggplot(hotel_data) +
  geom_histogram(aes(x = no_of_children),binwidth = 1, color="white",fill=custom_blue) +
  labs(x = "", y = "") +
  coord_cartesian(xlim = c(0, 3)) +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  ggtitle("Distribution of the Number of Children") +
  theme(text=element_text(size=10))

grid.arrange(histogram_adults_data, histogram_children_data, nrow = 1)

The majority of bookings consist of 2 adults and no children.

Number of Week & Weekend Nights

hist_weekends_night <- ggplot(hotel_data) +
  geom_histogram(aes(x = no_of_weekend_nights), binwidth = 1, color = "white",fill=custom_red) +
  labs(y = "Count", x = "") +
  coord_cartesian(xlim = c(0, 5)) +
  ggtitle("Distribution of Number of Weekend Nights") +
  theme(plot.title = element_text(size = 11))

hist_weekdays_nights <- ggplot(hotel_data) +
  geom_histogram(aes(x = no_of_week_nights), binwidth = 1, color = "white",fill=custom_blue) +
  labs(x = "", y = "") +
  coord_cartesian(xlim = c(0, 11)) +
  ggtitle("Distribution of Number of Week Nights") +
  theme(plot.title = element_text(size = 11))

grid.arrange(hist_weekends_night, hist_weekdays_nights, nrow = 1)

According to the data, a sizable portion of reservations only include weekday stays of one to three days and do not include weekend nights. On the other hand, a sizable percentage of reservations are for the full weekend, suggesting that longer weekend vacations are preferred.

Distribution of Special Requests

ggplot(hotel_data, aes(x = no_of_special_requests)) +
  geom_histogram(binwidth = 1,color = "white",fill=custom_blue) +
  labs(x = "Number of Special Requests", y = "Count") +
  scale_x_continuous(breaks = seq(0, max(hotel_data$no_of_special_requests), 1)) +
  ggtitle("Variation of Special Requests count")

Most customers usually don’t have any particular requests when they make a reservation. A tiny fraction, meanwhile, might have one or two exceptional needs, and in extreme circumstances, up to five special requirements.

Distribution of Recurring Customers

hotel_data_plot_repeated_guest <- ggplot(hotel_data, aes(x = repeated_guest, fill = repeated_guest)) + 
  geom_bar() + 
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.64) +
  scale_fill_manual(values = c(custom_red,custom_blue)) +
  theme_void() +
  theme(legend.position = "none")

hotel_pie_chart_repeated_guest <- ggplot(hotel_data, aes(x = "", fill = repeated_guest)) +
  geom_bar(width = 1) +
  coord_polar(theta = "y") +
  guides(fill = guide_legend(title = "Repeated Guest", ncol = 1)) +
  geom_text(aes(label = paste0(round((after_stat(count)) / sum(after_stat(count)) * 100, 2), "%")),
            stat = "count", 
            position = position_stack(vjust = 0.5)) +
  theme_void() +
  theme(legend.position = "bottom")

grid.arrange(hotel_data_plot_repeated_guest, 
             hotel_pie_chart_repeated_guest, 
             ncol = 2, widths = c(4, 3.5), top = "Distribution of Repeated Guest")

Since they make up about 96.7% of all guests, it is clear from the data that most of them are first-time guests at the hotel. Just 3.3% of the guests are repeat customers who have stayed at the hotel before.

Correlation Test

numerical_data <- hotel_data %>%
  select_if(is.numeric)
numerical_data <- hotel_data[, sapply(hotel_data, is.numeric)]
numerical_data <- Filter(is.numeric, hotel_data)
summary(numerical_data)
##   no_of_adults   no_of_children    no_of_weekend_nights no_of_week_nights
##  Min.   :0.000   Min.   : 0.0000   Min.   :0.0000       Min.   : 0.000   
##  1st Qu.:2.000   1st Qu.: 0.0000   1st Qu.:0.0000       1st Qu.: 1.000   
##  Median :2.000   Median : 0.0000   Median :1.0000       Median : 2.000   
##  Mean   :1.845   Mean   : 0.1052   Mean   :0.8105       Mean   : 2.204   
##  3rd Qu.:2.000   3rd Qu.: 0.0000   3rd Qu.:2.0000       3rd Qu.: 3.000   
##  Max.   :4.000   Max.   :10.0000   Max.   :7.0000       Max.   :17.000   
##    lead_time       arrival_year  arrival_month     arrival_date  
##  Min.   :  0.00   Min.   :2017   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 17.00   1st Qu.:2018   1st Qu.: 5.000   1st Qu.: 8.00  
##  Median : 57.00   Median :2018   Median : 8.000   Median :16.00  
##  Mean   : 85.28   Mean   :2018   Mean   : 7.429   Mean   :15.58  
##  3rd Qu.:126.00   3rd Qu.:2018   3rd Qu.:10.000   3rd Qu.:23.00  
##  Max.   :443.00   Max.   :2018   Max.   :12.000   Max.   :31.00  
##  no_of_previous_cancellations no_of_previous_bookings_not_canceled
##  Min.   : 0.00000             Min.   : 0.000                      
##  1st Qu.: 0.00000             1st Qu.: 0.000                      
##  Median : 0.00000             Median : 0.000                      
##  Mean   : 0.02335             Mean   : 0.153                      
##  3rd Qu.: 0.00000             3rd Qu.: 0.000                      
##  Max.   :13.00000             Max.   :58.000                      
##  avg_price_per_room no_of_special_requests room_type_reserved type_of_meal_plan
##  Min.   :  0.00     Min.   :0.00           Min.   :1.000      Min.   :0.0000   
##  1st Qu.: 80.30     1st Qu.:0.00           1st Qu.:1.000      1st Qu.:1.0000   
##  Median : 99.45     Median :0.00           Median :1.000      Median :1.0000   
##  Mean   :103.44     Mean   :0.62           Mean   :1.708      Mean   :0.9499   
##  3rd Qu.:120.00     3rd Qu.:1.00           3rd Qu.:1.000      3rd Qu.:1.0000   
##  Max.   :540.00     Max.   :5.00           Max.   :7.000      Max.   :3.0000
standardised_data <- scale(numerical_data)
correlation_hotel_data <- round(cor(numerical_data), 2)
melted_cormat <- melt(correlation_hotel_data)

ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = custom_blue, high = custom_red,
  limit = c(-1,1), name="Correlation") +
  theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
  geom_text(aes(Var2, Var1, label = value),size = 2) +
  labs(x = NULL, y = NULL)

pca <- PCA(standardised_data)

variance <- get_pca_var(pca)
fviz_pca_var(pca, col.var="contrib", gradient.cols = c("black","yellow","purple","red","blue","green","pink","violet","brown","orange"),ggrepel = TRUE ) + labs( title = "PCA Variable Variance")

The correlation calculations clearly show that the target column “canceled” is positively correlated with “lead_time,” “market_segment_type,” and “avg_price_per_room.” On the other hand, “repeated_guest” and “no_of_special_requests” show a negative correlation with the goal feature. Remarkably, though, “type_of_meal_plan,” “arrival_date,” and “arrival_month” exhibit little to no impact on the customer’s choice to cancel their reservation.

hotel_data$market_segment_type <- as.factor(hotel_data$market_segment_type)
hotel_data$canceled <- as.factor(hotel_data$canceled)
groups <- dummyVars(~ market_segment_type + canceled, data = hotel_data)
hotel_data <- cbind(hotel_data, as.data.frame(predict(groups, hotel_data)))
hotel_data[, -c(19)]
numeric_data <- hotel_data %>%
  select_if(is.numeric)
numeric_data <- hotel_data[, sapply(hotel_data, is.numeric)]
numeric_data <- Filter(is.numeric, hotel_data)
numeric_data <- numeric_data[, -c(20)]
summary(numeric_data)
##   no_of_adults   no_of_children    no_of_weekend_nights no_of_week_nights
##  Min.   :0.000   Min.   : 0.0000   Min.   :0.0000       Min.   : 0.000   
##  1st Qu.:2.000   1st Qu.: 0.0000   1st Qu.:0.0000       1st Qu.: 1.000   
##  Median :2.000   Median : 0.0000   Median :1.0000       Median : 2.000   
##  Mean   :1.845   Mean   : 0.1052   Mean   :0.8105       Mean   : 2.204   
##  3rd Qu.:2.000   3rd Qu.: 0.0000   3rd Qu.:2.0000       3rd Qu.: 3.000   
##  Max.   :4.000   Max.   :10.0000   Max.   :7.0000       Max.   :17.000   
##    lead_time       arrival_year  arrival_month     arrival_date  
##  Min.   :  0.00   Min.   :2017   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 17.00   1st Qu.:2018   1st Qu.: 5.000   1st Qu.: 8.00  
##  Median : 57.00   Median :2018   Median : 8.000   Median :16.00  
##  Mean   : 85.28   Mean   :2018   Mean   : 7.429   Mean   :15.58  
##  3rd Qu.:126.00   3rd Qu.:2018   3rd Qu.:10.000   3rd Qu.:23.00  
##  Max.   :443.00   Max.   :2018   Max.   :12.000   Max.   :31.00  
##  no_of_previous_cancellations no_of_previous_bookings_not_canceled
##  Min.   : 0.00000             Min.   : 0.000                      
##  1st Qu.: 0.00000             1st Qu.: 0.000                      
##  Median : 0.00000             Median : 0.000                      
##  Mean   : 0.02335             Mean   : 0.153                      
##  3rd Qu.: 0.00000             3rd Qu.: 0.000                      
##  Max.   :13.00000             Max.   :58.000                      
##  avg_price_per_room no_of_special_requests room_type_reserved type_of_meal_plan
##  Min.   :  0.00     Min.   :0.00           Min.   :1.000      Min.   :0.0000   
##  1st Qu.: 80.30     1st Qu.:0.00           1st Qu.:1.000      1st Qu.:1.0000   
##  Median : 99.45     Median :0.00           Median :1.000      Median :1.0000   
##  Mean   :103.44     Mean   :0.62           Mean   :1.708      Mean   :0.9499   
##  3rd Qu.:120.00     3rd Qu.:1.00           3rd Qu.:1.000      3rd Qu.:1.0000   
##  Max.   :540.00     Max.   :5.00           Max.   :7.000      Max.   :3.0000   
##  market_segment_type.Aviation market_segment_type.Complementary
##  Min.   :0.000000             Min.   :0.00000                  
##  1st Qu.:0.000000             1st Qu.:0.00000                  
##  Median :0.000000             Median :0.00000                  
##  Mean   :0.003449             Mean   :0.01076                  
##  3rd Qu.:0.000000             3rd Qu.:0.00000                  
##  Max.   :1.000000             Max.   :1.00000                  
##  market_segment_type.Corporate market_segment_type.Offline
##  Min.   :0.00000               Min.   :0.0000             
##  1st Qu.:0.00000               1st Qu.:0.0000             
##  Median :0.00000               Median :0.0000             
##  Mean   :0.05549               Mean   :0.2902             
##  3rd Qu.:0.00000               3rd Qu.:1.0000             
##  Max.   :1.00000               Max.   :1.0000             
##  market_segment_type.Online canceled.TRUE   
##  Min.   :0.00               Min.   :0.0000  
##  1st Qu.:0.00               1st Qu.:0.0000  
##  Median :1.00               Median :0.0000  
##  Mean   :0.64               Mean   :0.3278  
##  3rd Qu.:1.00               3rd Qu.:1.0000  
##  Max.   :1.00               Max.   :1.0000
correlation_data <- round(cor(numeric_data), 2)
melted_cormat_2 <- melt(correlation_data)

ggplot(data = melted_cormat_2, aes(x=Var1, y=Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = custom_blue, high = custom_red,
  limit = c(-1,1), name="Correlation") +
  theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
  geom_text(aes(Var2, Var1, label = value),size = 2) +
  labs(x = NULL, y = NULL)

pca_new <- PCA(numeric_data)

columns_to_extract <- c(12, 5, 11, 6, 20)
DATASET <- data.frame(numeric_data[, columns_to_extract])
features_pca <- PCA(DATASET)

pairs(DATASET)

library(GGally)
## Warning: package 'GGally' was built under R version 4.3.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(DATASET)

library(class)
## Warning: package 'class' was built under R version 4.3.2
library(caret)

# Split the dataset into training and testing sets
set.seed(123)  # For reproducibility
train_index <- sample(1:nrow(DATASET), 0.7 * nrow(DATASET))  # 70% for training
train_data <- DATASET[train_index, ]
test_data <- DATASET[-train_index, ]
response_variable_index <- which(names(DATASET) == "canceled.TRUE")
response_variable_index
## [1] 5
# Preprocess the data if necessary (e.g., scaling numeric variables)

# Train the KNN model
k <- 5  
# Number of neighbors
knn_model <- knn(train = train_data[, -response_variable_index], 
             test = test_data[, -response_variable_index], 
             cl = train_data[, response_variable_index], 
             k = k)
confusion_matrix_knn <- table(Actual = test_data$canceled.TRUE, Predicted = knn_model)
print(confusion_matrix_knn)
##       Predicted
## Actual    0    1
##      0 6569  763
##      1 1307 2233
library(caret)
k_values <- seq(1, 25, by = 2) 
train_control <- trainControl(method = "cv", number = 10) 
knn_model_results <- train(form = canceled.TRUE ~ .,
                           data = train_data,
                           method = "knn",
                           trControl = train_control,
                           tuneGrid = expand.grid(k = k_values))
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
best_k <- knn_model_results$bestTune$k
final_knn_model <- knn(train = train_data[, -response_variable_index],
                       test = test_data[, -response_variable_index],
                       cl = train_data[, response_variable_index],
                       k = best_k)
best_k
## [1] 9
confusion_matrix_KNN <- table(Actual = test_data$canceled.TRUE, Predicted = final_knn_model)
print(confusion_matrix_KNN)
##       Predicted
## Actual    0    1
##      0 6692  640
##      1 1474 2066
plot(final_knn_model)

library(e1071)

# Train Naive Bayes model
naive_bayes_model <- naiveBayes(canceled.TRUE ~ ., data = train_data)
naive_bayes_model
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         0         1 
## 0.6712923 0.3287077 
## 
## Conditional probabilities:
##    no_of_special_requests
## Y        [,1]      [,2]
##   0 0.7608645 0.8353723
##   1 0.3292156 0.5719549
## 
##    lead_time
## Y       [,1]     [,2]
##   0  59.2170 64.10792
##   1 139.4872 98.42692
## 
##    avg_price_per_room
## Y        [,1]     [,2]
##   0  99.99582 35.58241
##   1 110.55594 32.55786
## 
##    arrival_year
## Y       [,1]      [,2]
##   0 2017.775 0.4176603
##   1 2017.919 0.2733338
# Make predictions on the test dataset
predictions <- predict(naive_bayes_model, newdata = test_data)

# Build confusion matrix
confusion_matrix_nb <- table(Actual = test_data$canceled.TRUE, Predicted = predictions)
print(confusion_matrix_nb)
##       Predicted
## Actual    0    1
##      0 6543  789
##      1 1695 1845
predicted_prob <- predict(naive_bayes_model, newdata = test_data, type="raw") 
predicted_class <- predict(naive_bayes_model, newdata = test_data) 
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc(test_data$canceled.TRUE,predicted_prob[,1])
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
## 
## Call:
## roc.default(response = test_data$canceled.TRUE, predictor = predicted_prob[,     1])
## 
## Data: predicted_prob[, 1] in 7332 controls (test_data$canceled.TRUE 0) > 3540 cases (test_data$canceled.TRUE 1).
## Area under the curve: 0.8022
plot.roc(test_data$canceled.TRUE,predicted_prob[,1],print.thres="best")
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases

# Naive Bayes model
# Calculate accuracy
accuracy_nb <- sum(diag(confusion_matrix_nb)) / sum(confusion_matrix_nb)

# Calculate precision
precision_nb <- confusion_matrix_nb[2, 2] / sum(confusion_matrix_nb[, 2])

# Calculate recall (sensitivity)
recall_nb <- confusion_matrix_nb[2, 2] / sum(confusion_matrix_nb[2, ])

# Calculate F1 score
f1_score_nb <- 2 * (precision_nb * recall_nb) / (precision_nb + recall_nb)

# Print metrics for Naive Bayes model
cat("Naive Bayes Model:\n")
## Naive Bayes Model:
cat("Accuracy:", accuracy_nb, "\n")
## Accuracy: 0.7715232
cat("Precision:", precision_nb, "\n")
## Precision: 0.7004556
cat("Recall (Sensitivity):", recall_nb, "\n")
## Recall (Sensitivity): 0.5211864
cat("F1 Score:", f1_score_nb, "\n\n")
## F1 Score: 0.5976676
# KNN model
# Calculate accuracy
accuracy_knn <- sum(diag(confusion_matrix_KNN)) / sum(confusion_matrix_KNN)

# Calculate precision
precision_knn <- confusion_matrix_KNN[2, 2] / sum(confusion_matrix_KNN[, 2])

# Calculate recall (sensitivity)
recall_knn <- confusion_matrix_KNN[2, 2] / sum(confusion_matrix_KNN[2, ])

# Calculate F1 score
f1_score_knn <- 2 * (precision_knn * recall_knn) / (precision_knn + recall_knn)

# Print metrics for KNN model
cat("KNN Model:\n")
## KNN Model:
cat("Accuracy:", accuracy_knn, "\n")
## Accuracy: 0.8055556
cat("Precision:", precision_knn, "\n")
## Precision: 0.7634885
cat("Recall (Sensitivity):", recall_knn, "\n")
## Recall (Sensitivity): 0.5836158
cat("F1 Score:", f1_score_knn, "\n")
## F1 Score: 0.6615434

#comparing the model performance

comparison_df <- data.frame(
  Classifier = c("KNN", "Naive Bayes"),
  Accuracy = c(accuracy_knn, accuracy_nb),
  Precision = c(precision_knn, precision_nb),
  Recall = c(recall_knn, recall_nb),
  F1_Score = c(f1_score_knn, f1_score_nb)
)
print(comparison_df)
##    Classifier  Accuracy Precision    Recall  F1_Score
## 1         KNN 0.8055556 0.7634885 0.5836158 0.6615434
## 2 Naive Bayes 0.7715232 0.7004556 0.5211864 0.5976676
library(ggplot2)
library(reshape2)
comparison_df_melted <- melt(comparison_df, id.vars = "Classifier")
ggplot(comparison_df_melted, aes(x = variable, y = value, fill = Classifier)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  labs(title = "Comparison of Classifiers",
       x = "Metric",
       y = "Value",
       fill = "Classifier") +
  theme_minimal()